import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import researchpy as rp
import scipy.stats as stats
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols
# Load the US cars dataset and discard the redundant CSV index column.
df = pd.read_csv('C:/Users/As/Desktop/123/USA_cars_datasets.csv')
df = df.drop(columns='Unnamed: 0')
df.head()
df.describe()
# Hierarchical treemap of the listings: brand -> model -> color.
fig = px.treemap(
    df,
    path=['brand', 'model', 'color'],
    color='brand',
    hover_data=['model'],
    color_continuous_scale='rainbow',
)
fig.show()
# Descriptive statistics for the numeric columns.
rp.summary_cont(df[['year','mileage','lot','price']])
# Correlation matrix of the numeric columns. numeric_only=True is required on
# pandas >= 2.0, where DataFrame.corr() raises on object columns (brand,
# model, ...) instead of silently dropping them.
corr = df.corr(numeric_only=True)
corr.style.background_gradient(cmap='magma')
# Listings per brand, via the non-null count of the 'vin' column.
temp = pd.DataFrame(df.groupby(['brand']).count()['vin'])
temp = temp.sort_values('vin', ascending=False)
# Per the original notebook's numbers: brands with >10 listings cover
# ~98.2% of the rows while being only ~48.1% of all brands.
brand_list = temp[temp['vin'] > 10].index.values
# Average sale price for each brand with more than 10 listings.
av_prices = []
for brand in brand_list:
    subset = df[df['brand'] == brand]
    av_prices.append(subset['price'].sum() / len(subset))

data = pd.DataFrame({'brand_list': brand_list, 'av_prices': av_prices})
sorted_data = data.sort_values('av_prices', ascending=False)

sns.barplot(y=sorted_data['brand_list'], x=sorted_data['av_prices'], palette='GnBu_d')
plt.xlabel('Average Price ($)', fontsize=14)
plt.ylabel('Brand', fontsize=14)
plt.title('Average price per brand', fontsize=16)
plt.show()
# Pie chart of the 10 brands with the highest single listing price.
price = (df.groupby('brand')['price'].max()
           .reset_index()
           .sort_values(by='price')
           .tail(10))
fig = px.pie(price, values='price', names='brand', template='seaborn')
fig.update_traces(rotation=90, pull=0.05, textinfo='percent+label')
fig.show()
# Listing counts for the same filtered brand list, as a horizontal bar chart.
counts = [len(df[df['brand'] == brand]) for brand in brand_list]
data2 = pd.DataFrame({'brand_list': brand_list, 'counts': counts})
sorted_data2 = data2.sort_values('counts', ascending=False)

sns.barplot(y=sorted_data2['brand_list'], x=sorted_data2['counts'], palette='GnBu_d')
plt.xlabel('# of brands', fontsize=14)
plt.ylabel('Brand', fontsize=14)
plt.title('Number of brands', fontsize=16)
plt.show()
# Top 10 most common exterior colors as a pie chart.
color = df.loc[:, ['color']]
color['count'] = color.groupby(color.color)['color'].transform('count')
color = (color.drop_duplicates()
              .sort_values(by='count', ascending=False)
              .head(10))
fig = px.pie(color, values='count', names='color', template='seaborn')
fig.update_traces(rotation=90, pull=0.05, textinfo='percent+label')
fig.show()
# BUG FIX: 'car_brands' was never defined anywhere in this script, so this
# cell raised NameError. Reconstructed here as a per-brand aggregate whose
# ('price', 'count') MultiIndex column matches the sort key used below.
# NOTE(review): confirm this matches the notebook cell the snippet came from.
car_brands = df.groupby('brand').agg({'price': ['count', 'mean']})
sorted_cars = car_brands.sort_values(by=[('price', 'count')], ascending=False)
top_six = sorted_cars[0:6]
top_six
plt.figure(figsize=(15, 7))
plt.bar(x=top_six.index, height=top_six['price']['count'], alpha=.6,
        color=sns.color_palette('dark'))
plt.ylim(0, 1500)
plt.title('Top Six Car Brands', fontsize=15, fontweight='bold')
plt.xlabel('Car Brand', fontsize=13, color='b')
plt.ylabel('Quantity', fontsize=13, color='b');
# Sunburst of brand -> model for brands with more than 20 listings, sized by
# listing count and colored by price. A temporary 'count' column of ones is
# added so each leaf contributes 1, then removed again.
df['count'] = 1
brand_list = temp[temp['vin'] > 20].index.values
cars_b = df[df['brand'].isin(brand_list)]
c_sun = px.sunburst(cars_b, path=['brand', 'model'], values='count',
                    color='price', width=800, height=800,
                    color_continuous_scale='twilight')
df.drop(columns='count', inplace=True)
c_sun.show()
# Top brand is Ford. Animated bar chart: yearly listing counts (years after
# 2015) for the four most common brands.
perb = df.loc[:, ['year', 'brand']]
perb['count'] = perb.groupby([perb.brand, perb.year])['brand'].transform('count')
perb = perb.drop_duplicates()
top_brand = ['ford', 'dodge', 'nissan', 'chevrolet']
perb = perb[perb['brand'].isin(top_brand)]
perb = perb[perb.year > 2015]
perb = perb.sort_values(by='year')
fig = px.bar(perb, x='brand', y='count', animation_frame='year',
             animation_group='brand', color='brand', hover_name='brand')
fig.show()
import plotly.express as px  # harmless re-import; px is already in scope
# Price vs. model year, one color per brand, marker size scaled by price.
fig = px.scatter(df, x='year', y='price', color='brand', size='price')
fig.show()
import plotly.express as px  # harmless re-import; px is already in scope
# Number of listings per model year, restricted to 2010 onwards.
cars_by_model_year = (df.groupby('year')['model'].count()
                        .reset_index()
                        .sort_values('model', ascending=False))
cars_by_model_year = cars_by_model_year[cars_by_model_year['year'] >= 2010]
cars_by_model_year = cars_by_model_year.rename(columns={'model': 'count'})
fig = px.bar(cars_by_model_year, x='year', y='count', color='count')
fig.show()
# ------------------------------------------------------
# PLOTTING THE BREAKDOWN OF THE MORE PREVALENT YEARS
# ------------------------------------------------------
# Count plot of listings per model year, on axes spanning the whole figure.
fig = plt.figure(figsize=(12,5))
ax = fig.add_axes([0,0,1,1])
sns.countplot(x='year', data=df, ax=ax);
import matplotlib.pyplot as plt
# Scatter each price against its row position — a quick visual outlier check.
x_values = df.price
fig, ax = plt.subplots(figsize=(17, 7))
plt.scatter(x_values, range(len(x_values)))
plt.show()
fig, ax = plt.subplots(figsize=(17, 7))
# sns.distplot() was deprecated in seaborn 0.11 and removed in later
# releases; histplot(..., kde=True) is the drop-in replacement for this use.
ax1 = sns.histplot(df.price, kde=True)
# Dashed vertical line marking the mean price.
plt.axvline(np.mean(df.price), color='black', linestyle='dashed', linewidth=5);
# Box plots of price per model year. NOTE: 'data' is read again by a later
# cell (df1 = data), so keep this assignment.
data = df[['price','year']]
f, ax = plt.subplots(figsize=(16, 8))
fig = sns.boxplot(x='year', y="price", data=data)
# Pairwise scatter plots of the numeric columns.
sns.set()
# The 'size' keyword was renamed to 'height' in seaborn 0.9; passing
# size= raises TypeError on current releases.
sns.pairplot(df, height=3)
plt.show()
def compare_2_groups(arr_1, arr_2, alpha, sample_size):
    """Run an independent two-sample t-test and print the verdict.

    Parameters
    ----------
    arr_1, arr_2 : array-like
        The two samples to compare.
    alpha : float
        Significance level for rejecting H0 (equal means).
    sample_size : int
        Unused; kept for backward compatibility with existing callers.

    Returns
    -------
    tuple of (float, float)
        The t statistic and p-value. (The original returned None; returning
        the pair is backward-compatible and makes the result reusable.)
    """
    # Local import: the module-level `from scipy.stats import ttest_ind`
    # appears only later in this script, after the definition.
    from scipy.stats import ttest_ind

    stat, p = ttest_ind(arr_1, arr_2)
    print('Statistics=%.3f, p=%.3f' % (stat, p))
    if p > alpha:
        print('Same distributions (fail to reject H0)')
    else:
        print('Different distributions (reject H0)')
    return stat, p
from scipy.stats import f_oneway
from scipy.stats import ttest_ind
# NOTE(review): the two "samples" below are the price and mileage columns of
# the *same* rows — different units, same observations — so these tests are
# exploratory at best; confirm the intended comparison.
sample_size=15
ger_sampled = df.price
fr_sampled = df.mileage
compare_2_groups(df.price,df.mileage, 0.05, sample_size)
import scipy
# Paired t-test, pairing price with mileage row by row.
scipy.stats.ttest_rel(df.price,df.mileage)
import matplotlib.pyplot as plt  # harmless re-import; plt is already in scope
# Histogram of listing prices.
fig, ax = plt.subplots(figsize=(17, 7))
ax.hist(df.price, color='indigo')
ax.set_xlabel('Result')
ax.set_title('price of brand cars')
import matplotlib.pyplot as plt  # harmless re-import; plt is already in scope
# Histogram of listings by model year.
fig, ax = plt.subplots(figsize=(17, 7))
ax.hist(df.year, color='indigo')
ax.set_xlabel('Result')
ax.set_title('sale of cars group by year')
import scipy.stats as stats  # harmless re-import; stats is already in scope
# One-way ANOVA between the price and mileage columns (exploratory — these
# are two variables of the same rows, not two groups of one variable).
stats.f_oneway(df.price, df.mileage)
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
import numpy as np   # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory;
# list every file available to the kernel.
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Display floats with 3 decimals and without scientific notation.
np.set_printoptions(suppress=True)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn import linear_model
from sklearn.feature_selection import chi2
def understand_variables(dataset):
    """Print a quick structural overview of *dataset*.

    Shows the object type, shape, first rows, column names, per-column
    unique-value counts, and summary statistics for numeric and (when
    present) non-numeric columns.

    Parameters
    ----------
    dataset : pandas.DataFrame

    Returns
    -------
    None
    """
    print(type(dataset))
    print(dataset.shape)
    print(dataset.head())
    print(dataset.columns)
    print(dataset.nunique(axis=0))
    print(dataset.describe())
    # describe(exclude=[np.number]) raises ValueError when every column is
    # numeric; guard so the overview works for all-numeric frames too.
    try:
        print(dataset.describe(exclude=[np.number]))
    except ValueError:
        print('No non-numeric columns.')
def outlier_processing(dataset):
    """Clamp numeric outliers to the Tukey (IQR) fences, in place.

    For every numeric column, values below Q1 - 1.5*IQR or above
    Q3 + 1.5*IQR are replaced by the fence value; the column is then
    rounded and cast to int (preserving the original behaviour). The
    percentage of outliers per column is printed first.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Mutated in place; non-numeric columns are left untouched.

    Returns
    -------
    pandas.DataFrame
        The same (mutated) dataset, for chaining.
    """
    # Restrict the quantile computation to numeric columns: on pandas >= 2.0
    # DataFrame.quantile raises on object columns instead of dropping them,
    # and this function is called on a frame that still has string columns.
    numeric = dataset.select_dtypes(include=np.number)
    Q1 = numeric.quantile(0.25)
    Q3 = numeric.quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    print("\n-------------\n% of outliers\n")
    print(((numeric < lower) | (numeric > upper)).sum() / len(dataset) * 100)
    for col in list(IQR.index):
        dataset.loc[dataset[col] < lower[col], [col]] = lower[col]
        dataset.loc[dataset[col] > upper[col], [col]] = upper[col]
        # Original behaviour: clamped numeric columns become integers.
        dataset[col] = dataset[col].round(0).astype(int)
    return dataset
# Start from df; the .drop() below returns a new frame, so the feature
# engineering that follows does not mutate the original df.
cars_dataset =df
# dropping unnecessary columns
cars_dataset = cars_dataset.drop(["lot","vin"],axis=1)
understand_variables(cars_dataset)
################ Feature engineering ###########
######### convert year to age (2020 - year)
cars_dataset.year = 2020 - cars_dataset.year
######## condition column : [Listings expired = 0, remove 'left' from others, convert everything to minutess ]
# "Listing Expired" has no remaining time; recode it as zero minutes so it
# goes through the same parsing path as the other values.
cars_dataset.loc[cars_dataset.condition == "Listing Expired", 'condition'] = "0 minutes left"
cars_dataset['condition'] = cars_dataset.condition.str.replace("left","")
# Normalise every remaining value to a number of minutes (still as strings):
# "N minutes" -> N, "N hours" -> N*60, "N days" -> N*60*24.
cars_dataset.loc[cars_dataset.condition.str.contains("minutes"),'condition'] = (cars_dataset.loc[cars_dataset.condition.str.contains("minutes"),'condition'].astype(str).str.split().str[0].astype(int)).astype(str)
cars_dataset.loc[cars_dataset.condition.str.contains("hours"),'condition'] = (cars_dataset.loc[cars_dataset.condition.str.contains("hours"),'condition'].astype(str).str.split().str[0].astype(int) * 60).astype(str)
cars_dataset.loc[cars_dataset.condition.str.contains("days"),'condition'] = (cars_dataset.loc[cars_dataset.condition.str.contains("days"),'condition'].astype(str).str.split().str[0].astype(int) * 60*24).astype(str)
# Every value is now a plain digit string; make the column integer minutes.
cars_dataset.condition = cars_dataset.condition.astype(int)
######## dealing with outliers ########
cars_dataset = outlier_processing(cars_dataset)
####### get dummies ########
# One-hot encode the remaining categorical columns; dummy_na=True adds an
# explicit indicator column for missing values.
cars_dataset = pd.get_dummies(cars_dataset, dummy_na=True)
############# Linear regression ##############
# Predict price from all other (now numeric / one-hot) columns.
X = cars_dataset.drop("price",axis=1)
y = cars_dataset["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
reg = LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)
# NOTE(review): taking the absolute value hides negative predictions instead
# of fixing the model; every metric below is computed on the altered values.
y_pred = pd.Series(np.absolute(y_pred))
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
#print("coef_pval:\n", stats.coef_pval(reg, X_train, y_train))
print("R_2 = " +str(r2_score(y_test, y_pred)*100)+" %")
# Feature selection via chi-squared scores.
# NOTE(review): sklearn's chi2 is designed for non-negative features with a
# categorical target; using it with a continuous price target is dubious and
# it raises if any feature is negative — confirm this is intended.
scores, pvalues = chi2(X_train, y_train)
scores = pd.Series(scores)
pvalues = pd.Series(pvalues)
# Keep only the columns whose chi2 p-value is significant at the 5% level.
sig_p_val = pvalues[pvalues<0.05]
cars_col_index = sig_p_val.index
cars_col_index = pd.Series(cars_col_index)
# NOTE(review): positional selection assumes 'price' survives the filter;
# otherwise the drop below raises KeyError — verify.
cars_dataset = cars_dataset.iloc[:,cars_col_index]
print("Retained columns : " + str(cars_dataset.columns))
# Refit the same regression on the reduced feature set.
X = cars_dataset.drop("price",axis=1)
y = cars_dataset["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
reg = LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)
y_pred = pd.Series(np.absolute(y_pred))
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
#print("coef_pval:\n", stats.coef_pval(reg, X_train, y_train))
print("R_2 = " +str(r2_score(y_test, y_pred)*100)+" %")
import pingouin as pg
# Pearson correlation (with CI and Bayes factor) between year and price.
pg.corr(x=df['year'], y=df['price'])
from statsmodels.tsa.statespace import sarimax
# NOTE(review): SARIMAX assumes a time-ordered series; df["price"] is a
# cross-section of listings with no time index, so this fit/forecast is not
# meaningful as-is — confirm the intent.
model = sarimax.SARIMAX(df["price"], order=(1,1,1), seasonal_order=(1,1,0,12), enforce_invertibility=True)
results = model.fit()
results.summary()
# 11-step-ahead forecast from the fitted model.
predictions_int = results.get_forecast(steps=11)
predictions_int.predicted_mean
# Simple linear regression: price as a function of model year.
x = df["year"].values.reshape(-1,1)
y = df["price"].values.reshape(-1,1)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
regressor = LinearRegression()
regressor.fit(X_train, y_train)  # training the algorithm
# To retrieve the intercept:
print(regressor.intercept_)
# For retrieving the slope:
print(regressor.coef_)
y_pred = regressor.predict(X_test)
# BUG FIX: the original rebound the global name `df` to this small
# Actual/Predicted frame, destroying the cars dataset that later cells
# (f_oneway on df.price/df.mileage, the ANOVA melt) still read, which made
# them fail with AttributeError/KeyError. Use a dedicated name instead.
comparison = pd.DataFrame({'Actual': y_test.flatten(), 'Predicted': y_pred.flatten()})
comparison
df1 = comparison.head(19)
df1.plot(kind='bar', figsize=(14,6), color=("limegreen","gold"))
plt.grid(which='major', linestyle='-', linewidth='0.5', color='green')
plt.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
plt.show()
# Test data vs the fitted regression line.
plt.scatter(X_test, y_test, color='gold')
plt.plot(X_test, y_pred, color='limegreen', linewidth=2)
sns.set(color_codes=True)
sns.jointplot(x=X_test, y=y_test, kind="reg");
plt.show()
from sklearn import svm
# Support-vector regression on the same year -> price split.
svr = svm.SVR(gamma='auto')
svr.fit(X_train, y_train.ravel())
y_predict = svr.predict(X_test)
y_predict.shape
r2_score(y_test, y_predict)
from sklearn import tree
# Decision-tree regression on the same year -> price split.
clf = tree.DecisionTreeRegressor()
clf = clf.fit(X_train,y_train)
# BUG FIX: r2_score's signature is (y_true, y_pred); the original passed the
# predictions first, which silently computes a different (wrong) score
# because R^2 is not symmetric in its arguments.
r2_score(y_test, clf.predict(X_test))
import scipy.stats as stats
df1=data
# stats f_oneway functions takes the groups as input and returns F and P-value
fvalue, pvalue = stats.f_oneway(df.price,df.mileage)
print(fvalue, pvalue)
# get ANOVA table as R like output
import statsmodels.api as sm
from statsmodels.formula.api import ols
# reshape the d dataframe suitable for statsmodels package
d_melt = pd.melt(df.reset_index(), id_vars=['index'], value_vars=['year', 'price', 'mileage'])
# replace column names
# NOTE(review): after the melt this column holds the *source-variable name*
# ('year' / 'price' / 'mileage'), not calendar years — so the OLS/ANOVA and
# Tukey HSD below compare those three variables, and the 'Years' label is
# misleading. Confirm this is the intended comparison.
d_melt.columns = ['index', 'Years', 'value']
# Ordinary Least Squares (OLS) model
model = ols('value ~ C(Years)', data=d_melt).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
anova_table
# load packages
from statsmodels.stats.multicomp import pairwise_tukeyhsd
# perform multiple pairwise comparison (Tukey HSD)
m_comp = pairwise_tukeyhsd(endog=d_melt['value'], groups=d_melt['Years'], alpha=0.05)
print(m_comp)
print(f"Overall model F({model.df_model: .0f},{model.df_resid: .0f}) = {model.fvalue: .3f}, p = {model.f_pvalue: .4f}")